%pip install xlrd
%pip install imblearn
# dataframe package
import pandas as pd
import numpy as np
from pandasql import sqldf
# Plotting Packages
import matplotlib.pyplot as plt
%matplotlib inline
# Viz Packages
import holoviews as hv
from holoviews import opts, dim
from pandas.plotting import parallel_coordinates
# ML Packages
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, confusion_matrix, roc_auc_score
# XGBoost model
from xgboost import XGBClassifier
# Oversampling package
from imblearn.over_sampling import SMOTE
hv.extension('bokeh')
# Load the UCI credit-card default dataset. header=1 skips the first
# spreadsheet row (a title row) and the "ID" column becomes the index.
clients = pd.read_excel("defaultofcreditcardclients.xls", header=1, index_col="ID")
I'll use the ID column as the dataset's index and skip the first row, which only contains a title.
# Widen the column display so all 24 columns show up in the preview.
pd.set_option('display.max_columns', 30)
clients.head()
| LIMIT_BAL | SEX | EDUCATION | MARRIAGE | AGE | PAY_0 | PAY_2 | PAY_3 | PAY_4 | PAY_5 | PAY_6 | BILL_AMT1 | BILL_AMT2 | BILL_AMT3 | BILL_AMT4 | BILL_AMT5 | BILL_AMT6 | PAY_AMT1 | PAY_AMT2 | PAY_AMT3 | PAY_AMT4 | PAY_AMT5 | PAY_AMT6 | default payment next month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ID | ||||||||||||||||||||||||
| 1 | 20000 | female | university | married | 24 | 2 | 2 | -1 | -1 | -2 | -2 | 3913 | 3102 | 689 | 0 | 0 | 0 | 0 | 689 | 0 | 0 | 0 | 0 | 1 |
| 2 | 120000 | female | university | single | 26 | -1 | 2 | 0 | 0 | 0 | 2 | 2682 | 1725 | 2682 | 3272 | 3455 | 3261 | 0 | 1000 | 1000 | 1000 | 0 | 2000 | 1 |
| 3 | 90000 | female | university | single | 34 | 0 | 0 | 0 | 0 | 0 | 0 | 29239 | 14027 | 13559 | 14331 | 14948 | 15549 | 1518 | 1500 | 1000 | 1000 | 1000 | 5000 | 0 |
| 4 | 50000 | female | university | married | 37 | 0 | 0 | 0 | 0 | 0 | 0 | 46990 | 48233 | 49291 | 28314 | 28959 | 29547 | 2000 | 2019 | 1200 | 1100 | 1069 | 1000 | 0 |
| 5 | 50000 | male | university | married | 57 | -1 | 0 | -1 | 0 | 0 | 0 | 8617 | 5670 | 35835 | 20940 | 19146 | 19131 | 2000 | 36681 | 10000 | 9000 | 689 | 679 | 0 |
# Dtypes and non-null counts; EDUCATION and MARRIAGE have missing values.
clients.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 30000 entries, 1 to 30000 Data columns (total 24 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 LIMIT_BAL 30000 non-null int64 1 SEX 30000 non-null object 2 EDUCATION 29669 non-null object 3 MARRIAGE 29677 non-null object 4 AGE 30000 non-null int64 5 PAY_0 30000 non-null int64 6 PAY_2 30000 non-null int64 7 PAY_3 30000 non-null int64 8 PAY_4 30000 non-null int64 9 PAY_5 30000 non-null int64 10 PAY_6 30000 non-null int64 11 BILL_AMT1 30000 non-null int64 12 BILL_AMT2 30000 non-null int64 13 BILL_AMT3 30000 non-null int64 14 BILL_AMT4 30000 non-null int64 15 BILL_AMT5 30000 non-null int64 16 BILL_AMT6 30000 non-null int64 17 PAY_AMT1 30000 non-null int64 18 PAY_AMT2 30000 non-null int64 19 PAY_AMT3 30000 non-null int64 20 PAY_AMT4 30000 non-null int64 21 PAY_AMT5 30000 non-null int64 22 PAY_AMT6 30000 non-null int64 23 default payment next month 30000 non-null int64 dtypes: int64(21), object(3) memory usage: 5.7+ MB
NULL values at columns: EDUCATION, MARRIAGE
# Rename PAY_0 -> PAY_1 so it matches PAY_2..PAY_6, and shorten the target name.
clients.rename(columns={"PAY_0":"PAY_1", "default payment next month": "DEFAULTS"}, inplace=True)
Renamed PAY_0 and the target column to give the columns more consistent names and make them easier to work with.
# Distribution of the PAY_1 repayment-status codes.
clients["PAY_1"].value_counts()
0 14737 -1 5686 1 3688 -2 2759 2 2667 3 322 4 76 5 26 8 19 6 11 7 9 Name: PAY_1, dtype: int64
# Histograms of every numeric column to eyeball skew and outliers.
clients.hist(bins=50, figsize=(25,25), xrot=45)
plt.show()
There are a lot of outliers and the data is highly skewed — another task to handle during data processing.
Let's do more visualization
def stacked_bars_plot(data, attrs):
    '''Plot a grid of stacked bar charts, one per attribute in *attrs*,
    showing each attribute's normalized value distribution within each
    DEFAULTS class of *data*.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain a "DEFAULTS" column plus every column named in *attrs*.
    attrs : sequence of str
        Column names to plot; at most 3 charts per row.
    '''
    if len(attrs) < 4:
        rows, cols = 1, len(attrs)
    else:
        rows = int(np.ceil(len(attrs) / 3))
        cols = 3
    fig, axs = plt.subplots(rows, cols, figsize=(25, rows * 10))
    # Bug fix: with a single attribute plt.subplots returns a bare Axes
    # object (no .flatten()); normalize to a flat 1-D array of Axes.
    axs = np.atleast_1d(axs).flatten()
    for column, ax in zip(attrs, axs):
        df = data.groupby('DEFAULTS')[column].value_counts(normalize=True).unstack(column)
        df.plot.bar(stacked=True, ax=ax, title=f"Normalized Bar Chart for {column}")
    # Hide any unused axes left over in the last row of the grid.
    for ax in axs[len(attrs):]:
        ax.set_visible(False)
    plt.show()
# Demographic categoricals vs. default class.
cat_attrs = ["SEX", "EDUCATION", "MARRIAGE"]
stacked_bars_plot(clients, cat_attrs)
The distributions of these features look similar across the two classes, which suggests there is little correlation between the demographics and whether someone will default.
# Repayment-status columns vs. default class.
pay_attrs = ["PAY_1", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6"]
stacked_bars_plot(clients, pay_attrs)
# Linear correlation of every numeric feature with the target.
clients.corr(numeric_only=True)["DEFAULTS"].sort_values(ascending=False)
DEFAULTS 1.000000 PAY_1 0.324794 PAY_2 0.263551 PAY_3 0.235253 PAY_4 0.216614 PAY_5 0.204149 PAY_6 0.186866 AGE 0.013259 BILL_AMT6 -0.005372 BILL_AMT5 -0.006760 BILL_AMT4 -0.010156 BILL_AMT3 -0.014076 BILL_AMT2 -0.014193 BILL_AMT1 -0.019644 PAY_AMT6 -0.053183 PAY_AMT5 -0.055124 PAY_AMT3 -0.056250 PAY_AMT4 -0.056827 PAY_AMT2 -0.058579 PAY_AMT1 -0.072929 LIMIT_BAL -0.153520 Name: DEFAULTS, dtype: float64
# Count clients for each (PAY_2, PAY_1) status pair — i.e. how repayment
# status changed between two consecutive months.
# NOTE(review): DISTINCT is redundant here; GROUP BY already yields one
# row per (PAY_2, PAY_1) pair.
chord_df = sqldf(
"""SELECT DISTINCT PAY_2,
PAY_1,
COUNT(PAY_1) AS COUNT
FROM clients
GROUP BY PAY_2,
PAY_1
;
""", globals())
# Chord diagram: nodes are status codes, edge widths come from COUNT.
chord = hv.Chord(chord_df, kdims=["PAY_2", "PAY_1"], vdims=["COUNT"])
chord.opts(opts.Chord(width=800,
                      height=800,
                      labels=dim('index'),
                      cmap='Category20',
                      edge_color=dim('PAY_2'),
                      node_color= dim('index'),
                      title="The Payment Delay from August to September"
                      )
           )
In the chord diagram above, I plotted how clients' payment status changed between August and September. The chord diagram gives us an indication of how the clients behave: most clients who paid the minimum in August did the same in September. Other insights can be deduced from this diagram as well.
Now, it's time to clean the data. We have two features with missing data
# Inspect EDUCATION categories, including NaN and the stray 0 code.
clients.EDUCATION.value_counts(dropna=False)
university 14030 graduate school 10585 high school 4916 NaN 331 others 124 0 14 Name: EDUCATION, dtype: int64
# Inspect MARRIAGE categories, including NaN and the stray 0 code.
clients.MARRIAGE.value_counts(dropna=False)
single 15964 married 13659 NaN 323 0 54 Name: MARRIAGE, dtype: int64
# Treat the undocumented 0 codes in EDUCATION/MARRIAGE as missing.
# Fix: assign the replaced Series back instead of calling
# .replace(..., inplace=True) on an attribute-accessed column, which
# relies on chained assignment (deprecated and warning-prone in recent
# pandas versions and not guaranteed to modify the original frame).
clients["EDUCATION"] = clients["EDUCATION"].replace({0: np.nan})
clients["MARRIAGE"] = clients["MARRIAGE"].replace({0: np.nan})
clients["EDUCATION"].value_counts(dropna=False)
university 14030 graduate school 10585 high school 4916 NaN 345 others 124 Name: EDUCATION, dtype: int64
# Instantiate the ordinal encoder (category strings -> numeric codes)
# and the KNN imputer used afterwards to fill the remaining NaNs.
encoder = OrdinalEncoder()
imputer = KNNImputer()
def encode(column, enc=None):
    '''Ordinal-encode the non-null values of a Series.

    Null entries are left untouched so KNNImputer can fill them later.

    Parameters
    ----------
    column : pd.Series
        Categorical column, possibly containing NaN.
    enc : OrdinalEncoder, optional
        Encoder to use; defaults to the module-level ``encoder``.
        (Generalization: allows callers to supply their own encoder.)

    Returns
    -------
    pd.Series
        The same Series with non-null values replaced by numeric codes.
    '''
    if enc is None:
        enc = encoder
    # Compute the non-null mask once and reuse it for fit and write-back.
    mask = column.notnull()
    # Reshape to the (n_samples, 1) layout the encoder expects.
    codes = enc.fit_transform(column[mask].to_numpy().reshape(-1, 1))
    # Assign the codes back, leaving NaNs in place for the imputer.
    column.loc[mask] = np.squeeze(codes)
    return column
# Encode each demographic column; .copy() avoids mutating a view of the
# original frame inside encode().
for column in cat_attrs:
    clients[column] = encode(clients[column].copy())
%%time
# KNN-impute the remaining NaNs (EDUCATION/MARRIAGE), rounding so the
# imputed category codes stay integral. NOTE: fit_transform returns a
# plain ndarray, so the ID index is dropped (RangeIndex afterwards) and
# every column becomes float64.
encode_data = pd.DataFrame(np.round(imputer.fit_transform(clients)),
                           columns = clients.columns)
CPU times: user 4.74 s, sys: 15.1 s, total: 19.8 s Wall time: 2.49 s
# Confirm no NaNs remain after imputation.
encode_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 30000 entries, 0 to 29999 Data columns (total 24 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 LIMIT_BAL 30000 non-null float64 1 SEX 30000 non-null float64 2 EDUCATION 30000 non-null float64 3 MARRIAGE 30000 non-null float64 4 AGE 30000 non-null float64 5 PAY_1 30000 non-null float64 6 PAY_2 30000 non-null float64 7 PAY_3 30000 non-null float64 8 PAY_4 30000 non-null float64 9 PAY_5 30000 non-null float64 10 PAY_6 30000 non-null float64 11 BILL_AMT1 30000 non-null float64 12 BILL_AMT2 30000 non-null float64 13 BILL_AMT3 30000 non-null float64 14 BILL_AMT4 30000 non-null float64 15 BILL_AMT5 30000 non-null float64 16 BILL_AMT6 30000 non-null float64 17 PAY_AMT1 30000 non-null float64 18 PAY_AMT2 30000 non-null float64 19 PAY_AMT3 30000 non-null float64 20 PAY_AMT4 30000 non-null float64 21 PAY_AMT5 30000 non-null float64 22 PAY_AMT6 30000 non-null float64 23 DEFAULTS 30000 non-null float64 dtypes: float64(24) memory usage: 5.5 MB
Some values in the age column were unreasonable; some ages were above 140.
# Rows with implausible ages (> 100) — likely data-entry errors.
invalid_ages = encode_data[encode_data["AGE"] > 100].index
invalid_ages
Int64Index([4011, 4116, 5395, 6963, 7318, 8940, 29496], dtype='int64')
# Cap the implausible ages at 100, then verify none remain above the cap.
encode_data.loc[invalid_ages, "AGE"] = 100
encode_data[encode_data["AGE"] > 100].index
Int64Index([], dtype='int64')
Now that we have cleaned our data, we have to fix the issue of imbalanced data. I'll use SMOTE to fix this issue
# Class balance before resampling: defaults are the minority class.
encode_data["DEFAULTS"].hist()
plt.show()
# NOTE(review): SMOTE is applied to the FULL dataset before the
# train/test split below, so synthetic samples derived from (future)
# test rows leak into training and the test scores will be optimistic.
# Standard practice is to split first and oversample only the training
# fold — worth confirming/fixing.
X, y = SMOTE().fit_resample(encode_data.drop("DEFAULTS", axis=1), encode_data["DEFAULTS"])
y.hist()
plt.show()
Now our two classes are balanced
# 67/33 split. NOTE(review): no random_state, so the split (and all
# downstream scores) is not reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
%%time
# Baseline random forest with default hyperparameters.
forest = RandomForestClassifier()
forest.fit(X_train, y_train)
CPU times: user 5.59 s, sys: 4.07 ms, total: 5.6 s Wall time: 5.6 s
RandomForestClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestClassifier()
# Evaluate the baseline forest on the held-out set.
forest_y_pred = forest.predict(X_test)
print(classification_report(y_test, forest_y_pred))
precision recall f1-score support
0.0 0.85 0.93 0.89 7698
1.0 0.92 0.84 0.88 7723
accuracy 0.88 15421
macro avg 0.88 0.88 0.88 15421
weighted avg 0.88 0.88 0.88 15421
# Confusion matrix for the baseline forest.
cm = confusion_matrix(y_test, forest_y_pred)
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=forest.classes_).plot()
plt.show()
params = {'max_depth': [10, 50,100],
'n_estimators': [100, 500, 1000]
}
%%time
forest_tuned = GridSearchCV(estimator=model,
param_grid=params,
scoring='recall',
verbose=0)
forest_tuned.fit(X_train, y_train)
CPU times: user 16min 13s, sys: 1.01 s, total: 16min 14s Wall time: 16min 14s
GridSearchCV(estimator=RandomForestClassifier(),
param_grid={'max_depth': [10, 50, 100],
'n_estimators': [100, 500, 1000]},
scoring='recall')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(estimator=RandomForestClassifier(),
param_grid={'max_depth': [10, 50, 100],
'n_estimators': [100, 500, 1000]},
scoring='recall')RandomForestClassifier()
RandomForestClassifier()
# Evaluate the tuned forest on the held-out set.
forest_tuned_y_pred = forest_tuned.predict(X_test)
print(classification_report(y_test, forest_tuned_y_pred))
precision recall f1-score support
0.0 0.85 0.92 0.88 7691
1.0 0.92 0.84 0.87 7730
accuracy 0.88 15421
macro avg 0.88 0.88 0.88 15421
weighted avg 0.88 0.88 0.88 15421
# Confusion matrix for the tuned forest.
cm = confusion_matrix(y_test, forest_tuned_y_pred)
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=forest_tuned.classes_).plot()
plt.show()
%%timeit
xgmodel = XGBClassifier()
xgmodel.fit(X_train, y_train)
1.15 s ± 142 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# Evaluate the default-parameter XGBoost model on the held-out set.
xg_y_pred = xgmodel.predict(X_test)
print(classification_report(y_test, xg_y_pred))
precision recall f1-score support
0.0 0.86 0.96 0.91 7698
1.0 0.96 0.85 0.90 7723
accuracy 0.90 15421
macro avg 0.91 0.90 0.90 15421
weighted avg 0.91 0.90 0.90 15421
# Confusion matrix for the default-parameter XGBoost model.
cm = confusion_matrix(y_test, xg_y_pred)
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=xgmodel.classes_).plot()
plt.show()
# Hyperparameter grid for XGBoost (3*3*3*3 = 81 combinations).
params = { 'max_depth': [3,6,10],
           'learning_rate': [0.01, 0.08, 0.1],
           'n_estimators': [100, 500, 1000],
           'colsample_bytree': [0.3, 0.7, 1]
         }
%%time
# Grid search over the XGBoost grid. GridSearchCV clones xgmodel, so
# its earlier fit is not reused — only its other default parameters.
xg_tuned = GridSearchCV(estimator=xgmodel,
                        param_grid=params,
                        scoring='recall',
                        verbose=1)
xg_tuned.fit(X_train, y_train)
Fitting 5 folds for each of 81 candidates, totalling 405 fits CPU times: user 7h 14min 59s, sys: 1min 23s, total: 7h 16min 22s Wall time: 27min 28s
GridSearchCV(estimator=XGBClassifier(base_score=0.5, booster='gbtree',
callbacks=None, colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1,
early_stopping_rounds=None,
enable_categorical=False, eval_metric=None,
feature_types=None, gamma=0, gpu_id=-1,
grow_policy='depthwise',
importance_type=None,
interaction_constraints='',
learning_rate=0.300000012, max_...
max_cat_threshold=64, max_cat_to_onehot=4,
max_delta_step=0, max_depth=6,
max_leaves=0, min_child_weight=1,
missing=nan, monotone_constraints='()',
n_estimators=100, n_jobs=0,
num_parallel_tree=1, predictor='auto',
random_state=0, ...),
param_grid={'colsample_bytree': [0.3, 0.7, 1],
'learning_rate': [0.01, 0.08, 0.1],
'max_depth': [3, 6, 10],
'n_estimators': [100, 500, 1000]},
scoring='recall', verbose=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(estimator=XGBClassifier(base_score=0.5, booster='gbtree',
callbacks=None, colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1,
early_stopping_rounds=None,
enable_categorical=False, eval_metric=None,
feature_types=None, gamma=0, gpu_id=-1,
grow_policy='depthwise',
importance_type=None,
interaction_constraints='',
learning_rate=0.300000012, max_...
max_cat_threshold=64, max_cat_to_onehot=4,
max_delta_step=0, max_depth=6,
max_leaves=0, min_child_weight=1,
missing=nan, monotone_constraints='()',
n_estimators=100, n_jobs=0,
num_parallel_tree=1, predictor='auto',
random_state=0, ...),
param_grid={'colsample_bytree': [0.3, 0.7, 1],
'learning_rate': [0.01, 0.08, 0.1],
'max_depth': [3, 6, 10],
'n_estimators': [100, 500, 1000]},
scoring='recall', verbose=1)XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
early_stopping_rounds=None, enable_categorical=False,
eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
grow_policy='depthwise', importance_type=None,
interaction_constraints='', learning_rate=0.300000012,
max_bin=256, max_cat_threshold=64, max_cat_to_onehot=4,
max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
missing=nan, monotone_constraints='()', n_estimators=100,
n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0, ...)XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
early_stopping_rounds=None, enable_categorical=False,
eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
grow_policy='depthwise', importance_type=None,
interaction_constraints='', learning_rate=0.300000012,
max_bin=256, max_cat_threshold=64, max_cat_to_onehot=4,
max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
missing=nan, monotone_constraints='()', n_estimators=100,
n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0, ...)xg_tuned_y_pred = xg_tuned.predict(X_test)
# Report for the tuned XGBoost (predictions computed in the previous cell).
print(classification_report(y_test, xg_tuned_y_pred))
precision recall f1-score support
0.0 0.84 0.93 0.88 7691
1.0 0.92 0.83 0.87 7730
accuracy 0.88 15421
macro avg 0.88 0.88 0.88 15421
weighted avg 0.88 0.88 0.88 15421
# Confusion matrix for the tuned XGBoost model.
cm = confusion_matrix(y_test, xg_tuned_y_pred)
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=xg_tuned.classes_).plot()
plt.show()
We can see that our XGBoost model has the best performance, even with its default parameters. Depending on our business case and the required accuracy, we can decide whether further tuning should be done. We might also consider other models, like an ANN or Bayesian inference.